annotation_air <- "dex"
annotation_mac <- "condition_name"
diffexpMethod <- "limma"Comparing two datasets with Venn Diagram
Here is an example of an application for the function plot_venn. For this script, we used 2 different public datasets airway and macrophage.
The airway dataset provides a gene expression dataset derived from human bronchial epithelial cells, treated or not with dexamethasone (a corticosteroid).
The macrophage dataset provides bulk RNA-seq count data from human monocyte-derived macrophages. These cells were either left untreated or stimulated with interferon-gamma (IFN-γ). An experimental condition was applied to each of these cells, left unstimulated, stimulated with IFN-gamma, infected with SL1344, or pre-treated with IFN-g and then infected with SL1344. For this comparison, we will focus on the samples of the 2 first conditions (IFN-g, naive).
To compare the datasets, we will do the pre-processing part of the sup and unsup analysis, until the diffexp.
Import libraries
library(airway)
library(SummarizedExperiment)
library(macrophage)
library(biomaRt)
library(CAIBIrnaseq)Load datasets
#Loading the 2 datasets
data(airway, package="airway");
airway <- airway
data("gse")
macrophage <- gse; rm(gse)
macrophage <- macrophage[,macrophage$condition %in% c("naive", "IFNg")] # focus on these 2 conditionsEven if the datasets are globally build the same way, the names of the variables are not exactly the same, so if we want to keep the same code, we need to redefine a bit the variables.
If you want to know what are the used variables in this part, run this command line : colnames(SummarizedExperiment::rowData(exp_data)) You should have these variables (with these exact same names): - gene_name - gene_id - gene_length_kb - gene_description - gene_biotype
If not, you should look at how your dataset is defined. You might need to run some command line as the following ones:
rowData(airway)$gene_length_kb <-(rowData(airway)$gene_seq_end - rowData(airway)$gene_seq_start) / 1000
mart_air <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
gene_ids_air <- rowData(airway)$gene_id
annot_air <- getBM(attributes = c("ensembl_gene_id", "description"),
filters = "ensembl_gene_id",
values = gene_ids_air,
mart = mart_air)
matched_air <- match(rowData(airway)$gene_id, annot_air$ensembl_gene_id)
rowData(airway)$gene_description <- annot_air$description[matched_air]rowData(macrophage)$gene_length_kb <- width(rowRanges(macrophage)) / 1000
rownames(macrophage) <- sub("\\..*$", "", rowData(macrophage)$gene_id)
rowData(macrophage)$gene_id <- sub("\\..*$", "", rowData(macrophage)$gene_id)
rowData(macrophage)$gene_name <- rowData(macrophage)$SYMBOL
macrophage <- macrophage[!is.na(rowData(macrophage)$SYMBOL), ]
mart_mac <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
annot_mac <- getBM(
attributes = c("ensembl_gene_id", "gene_biotype", "description"),
filters = "ensembl_gene_id",
values = unique(rowData(macrophage)$gene_id),
mart = mart_mac
)
# Correspondance entre gene_id et annot_mac
match_mac <- match(rowData(macrophage)$gene_id, annot_mac$ensembl_gene_id)
# Ajouter les colonnes dans rowData
rowData(macrophage)$gene_biotype <- annot_mac$gene_biotype[match_mac]
rowData(macrophage)$gene_description <- annot_mac$description[match_mac]Pre-processing
airway <- rebase_gexp(airway, annotation = "gene_name")macrophage <- rebase_gexp(macrophage, annotation = "SYMBOL") # or gene_name if you renamed the variable Filter
airway <- filter_gexp(airway,
min_nsamp = 1,
min_counts = 1)macrophage <- filter_gexp(macrophage,
min_nsamp = 1,
min_counts = 1)Quality Control
colData(airway)$sample_id <- colnames(airway)
plot_qc_filters(airway)Saving 7 x 5 in image
Warning: Removed 8 rows containing missing values or values outside the scale range
(`geom_text_repel()`).
Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomTextRepel() has yet to be implemented in plotly.
If you'd like to see this geom implemented,
Please open an issue with your example code at
https://github.com/ropensci/plotly/issues
colData(macrophage)$sample_id <- colnames(macrophage)
plot_qc_filters(macrophage)Saving 7 x 5 in image
Warning: Removed 12 rows containing missing values or values outside the scale range
(`geom_text_repel()`).
geom_GeomTextRepel() has yet to be implemented in plotly.
If you'd like to see this geom implemented,
Please open an issue with your example code at
https://github.com/ropensci/plotly/issues
Normalize
airway <- normalize_gexp(airway)
assay(macrophage) <- round(assay(macrophage))
macrophage <- normalize_gexp(macrophage)PCA
#|message: false
metadata(airway)[["pca_res"]] <- pca_gexp(airway)
annotations <- setdiff(annotation_air, c("exp_cluster", "path_cluster"))
plot_pca(airway, color = annotation_air)Saving 7 x 5 in image
metadata(macrophage)[["pca_res"]] <- pca_gexp(macrophage)
annotations <- setdiff(annotation_mac, c("exp_cluster", "path_cluster"))
plot_pca(macrophage, color = annotation_mac)Saving 7 x 5 in image
Diffexp
library(edgeR)
colData(airway)$dex <- factor(colData(airway)$dex)
colData(airway)$dex <- factor(colData(airway)$dex, levels = c("untrt", "trt"))
diffexp_air <- diffExpAnalysis(countData = assays(airway)$counts,
sampleInfo = colData(airway),
method = diffexpMethod, cutoff = 10,
annotation = annotation_air)colData(macrophage)$condition_name <- factor(colData(macrophage)$condition_name)
colData(macrophage)$condition_name <- factor(colData(macrophage)$condition_name, levels = c("IFNg", "naive"))
diffexp_mac <- diffExpAnalysis(countData = assays(macrophage)$counts,
sampleInfo = colData(macrophage),
method = diffexpMethod, cutoff = 10,
annotation = annotation_mac)Venn Diagram
#|message: false
if(tolower(diffexpMethod) == "limma") {
diffexp_air <- diffexp_air |>
dplyr::rename(
log2FoldChange = logFC,
pvalue = P.Value,
padj = adj.P.Val
)
}
if(tolower(diffexpMethod) == "limma"){
diffexp_mac <- diffexp_mac |>
dplyr::rename(
log2FoldChange = logFC,
pvalue = P.Value,
padj = adj.P.Val
)
}
# Exemple : extraction des gènes DE significatifs
degs_air <- rownames(diffexp_air[diffexp_air$padj < 0.05, ])
degs_mac <- rownames(diffexp_mac[diffexp_mac$padj < 0.05, ])
universe_size <- length(union(degs_air, degs_mac))
# Diagramme de Venn
plot_venn(degs_air, degs_mac, universe_size,
v1_name = "Airway", v2_name = "Macrophage",
fills = c("lightgreen", "skyblue"),
title = "DEGs communs et spécifiques")inter <- intersect(degs_air, degs_mac)
msigdbr::msigdbr_collections() |>
kableExtra::kbl() |>
kableExtra::kable_styling() |>
kableExtra::scroll_box(height = "300px")| gs_collection | gs_subcollection | gs_collection_name | num_genesets |
|---|---|---|---|
| C1 | Positional | 302 | |
| C2 | CGP | Chemical and Genetic Perturbations | 3494 |
| C2 | CP | Canonical Pathways | 19 |
| C2 | CP:BIOCARTA | BioCarta Pathways | 292 |
| C2 | CP:KEGG_LEGACY | KEGG Legacy Pathways | 186 |
| C2 | CP:KEGG_MEDICUS | KEGG Medicus Pathways | 658 |
| C2 | CP:PID | PID Pathways | 196 |
| C2 | CP:REACTOME | Reactome Pathways | 1736 |
| C2 | CP:WIKIPATHWAYS | WikiPathways | 830 |
| C3 | MIR:MIRDB | miRDB | 2377 |
| C3 | MIR:MIR_LEGACY | MIR_Legacy | 221 |
| C3 | TFT:GTRD | GTRD | 505 |
| C3 | TFT:TFT_LEGACY | TFT_Legacy | 610 |
| C4 | 3CA | Curated Cancer Cell Atlas gene sets | 148 |
| C4 | CGN | Cancer Gene Neighborhoods | 427 |
| C4 | CM | Cancer Modules | 431 |
| C5 | GO:BP | GO Biological Process | 7608 |
| C5 | GO:CC | GO Cellular Component | 1026 |
| C5 | GO:MF | GO Molecular Function | 1820 |
| C5 | HPO | Human Phenotype Ontology | 5653 |
| C6 | Oncogenic Signature | 189 | |
| C7 | IMMUNESIGDB | ImmuneSigDB | 4872 |
| C7 | VAX | HIPC Vaccine Response | 347 |
| C8 | Cell Type Signature | 840 | |
| H | Hallmark | 50 |
pathway_collections <- c("CP:KEGG_MEDICUS")
pathways <- get_annotation_collection(pathway_collections,
species = "human")-- Collecting CP:KEGG_MEDICUS from MSigDB...
dir.create(file.path("results", "airway", "pathways"), recursive = TRUE, showWarnings = FALSE)
# airway
pathway_scores <- score_pathways(airway, pathways, verbose = FALSE)! Duplicated gene IDs removed from gene set KEGG_MEDICUS_PATHOGEN_HTLV_1_TAX_TO_NFY_MEDIATED_TRANSCRIPTION
! Duplicated gene IDs removed from gene set KEGG_MEDICUS_REFERENCE_ANTIGEN_PROCESSING_AND_PRESENTATION_BY_MHC_CLASS_II_MOLECULES
metadata(airway)[["pathway_scores"]] <- pathway_scores
collections <- pathway_collections |>
paste(collapse = "_") |>
stringr::str_remove("\\:")
plot_pathway_heatmap(airway, annotations = annotation_air,
fwidth = 9,
fname = stringr::str_glue(
"results/airway/pathways/hm_paths_{collections}_top20.pdf")
)Warning in prep_scores_hm(exp_data, pathway_scores, pathways): 'sample_id'
already exists in colData and will be overwritten.
-- Saving heatmap at results/airway/pathways/hm_paths_CPKEGG_MEDICUS_top20.pdf
write.csv(pathways, file = stringr::str_glue("results/airway/pathways/paths_{collections}.csv"))